Notes:
# Show the current working directory; the relative data-file path used when
# loading the data set is resolved against it.
getwd()
## [1] "/home/matar/GitHub/Online_Courses/Udacity-ND/P04_Explore_And _Summarize_Data/Lessons/Explore_Two_Variable"
# List the files in the working directory to confirm the data file is present.
list.files()
## [1] "lesson4_student_files" "lesson4_student.rmd" "pseudo_facebook.tsv"
library(ggplot2)
# Load the tab-separated pseudo-Facebook user data. read.delim() has the same
# defaults as read.csv() except that the field separator is a tab.
pf <- read.delim("pseudo_facebook.tsv")
# List the available columns.
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes: - Examine the relationship between two continuous variables.
# Scatterplot of friend count against age.
# qplot(x = age, y = friend_count, data = pf) draws the same plot, but qplot()
# is deprecated in current ggplot2, so the ggplot() form used throughout the
# rest of these notes is used here as well.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point()
Response:
Notes:
# Summarise the age distribution (minimum, quartiles, mean and maximum).
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
# Scatterplot of friend count against age with the x-axis limited to ages
# 13-90; rows outside the limits are dropped with a warning.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# Same scatterplot with alpha = 1/20, so it takes 20 overlapping points to
# reach full opacity; this reveals where the data is dense.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Jitter the points to break up the vertical bands produced by integer ages.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(color = "red", alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5188 rows containing missing values (geom_point).
Response:
Notes:
# Square-root transform of the y-axis to spread out the low friend counts.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# A first attempt jittered the points in both directions:
#   ggplot(aes(x = age, y = friendships_initiated), data = pf) +
#     geom_point(alpha = 1/20, position = 'jitter') +
#     coord_trans(y = 'sqrt')
# That fails: the default jitter also adds vertical noise, so users with zero
# friendships initiated can be pushed below zero, and the square root of a
# negative number is undefined.
# Setting height = 0 keeps the jitter horizontal only. The argument name is
# written in full instead of relying on partial matching of `h`.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")
Notes:
Notes:
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Conditional summaries: mean and median friend count, plus the number of
# users, for every age. Rows are ordered by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Preview the first 20 age groups.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Create your plot!
# Mean friend count by age drawn as a line; swap in geom_point() to see the
# individual age groups instead.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes:
# Overlay conditional summaries on the raw scatterplot: the solid line is the
# mean of y at each age, and the dashed blue lines are the 10th, 50th and 90th
# percentiles.
# coord_cartesian() zooms the view without discarding rows, so the summaries
# are still computed from all of the data.
# NOTE(review): `fun.y` is kept as written; recent ggplot2 releases deprecate
# it in favour of `fun` -- confirm the installed version before renaming.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000)) +
  geom_point(alpha = 0.05,
             # Horizontal jitter only; `height` is written in full instead of
             # relying on partial matching of `h`.
             position = position_jitter(height = 0),
             color = 'Orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.5),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.9),
            linetype = 2, color = 'blue')
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend count over all users.
cor.test(pf$age, pf$friend_count, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Alternative: run the same test inside with(), so the columns of pf can be
# referenced without the pf$ prefix.
with(pf, cor.test(age, friend_count, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response:
Notes:
# Repeat the Pearson test for users aged 70 or younger only.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes: The Pearson product-moment correlation measures the strength of the linear relationship between two variables.
Notes:
# Scatterplot of total likes received against likes received on the web.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()
Notes: The correlation coefficient is invariant under a linear transformation of either X or Y, and the slope of the regression line when both X and Y have been transformed to z-scores is the correlation coefficient.
# Zoom in on the bulk of the data by capping each axis at the 95th percentile
# of the variable plotted on that axis, then overlay a linear fit in red.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  # The y-axis shows likes_received, so its cap must come from that column
  # (it was previously taken from www_likes_received).
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
## Warning: Removed 10904 rows containing non-finite values (stat_smooth).
## Warning: Removed 10904 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_smooth).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Correlation coefficient computed over the full data set, i.e. including the
# top 5% of values.
cor.test(pf$www_likes_received, pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response:
Notes:
Notes: correlation could help us to decide which variables are related
#install.packages('alr3')
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set (presumably provided by the alr3 package
# attached just before this call).
data(Mitchell)
# Open the help page describing the data set.
?Mitchell
Create your plot!
# head(Mitchell, 50)
# Scatterplot of temperature against month.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place)
# Pearson correlation test between month and temperature.
cor.test(Mitchell$Month, Mitchell$Temp)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
#0.057 very weak correlation
Notes:
# range(Mitchell$Month)
# Place x-axis breaks every 12 months (0, 12, ..., 192) so that each interval
# spans one year, which makes any yearly cycle easier to spot.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))
What do you notice? Response:
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Add a fractional age in years that also accounts for the birth month:
# a December birthday adds 0 and a January birthday adds 11/12.
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))

# Mean and median friend count, plus the number of users, for every
# month-resolution age. Rows are ordered by that age.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# Preview the first rows of the summary.
head(pf.fc_by_age_months)
## # A tibble: 6 × 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
Programming Assignment
# Mean friend count by month-resolution age, restricted to ages below 71;
# swap in geom_point() to see the individual groups instead.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Notes:
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Compare the yearly and monthly summaries over the same age range. Both
# plots keep ages below 71 so the two curves describe the same users; the
# yearly plot previously stopped at age < 70.
# Yearly means with a smoother on top.
p1 <- ggplot(aes(x = age, y = friend_count_mean),
             data = subset(pf.fc_by_age, age < 71)) +
  geom_line() +
  geom_smooth()

# Monthly means with a smoother on top; noisier because each group is smaller.
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
             data = subset(pf.fc_by_age_months, age_with_months < 71)) +
  geom_line() +
  geom_smooth()

# Stack the monthly plot above the yearly one in a single column.
grid.arrange(p2, p1, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
Notes:
Reflection: - scatter plots - conditional summaries (like means) - correlation coefficient
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!